#import all the necessary packages.
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout
# Enable plotly's offline mode so figures render inside the notebook.
plotly.offline.init_notebook_mode(connected=True)
# Silence library warnings (e.g. pandas SettingWithCopyWarning) to keep output readable.
warnings.filterwarnings("ignore")
#The json file contains the data we got from Amazon API
# loading the data using pandas' read_json file.
# Result: one row per product, 19 raw feature columns.
data = pd.read_json('../data/tops_fashion.json')
print ('Number of data points : ', data.shape[0], \
'Number of features/variables:', data.shape[1])
Number of data points : 183138 Number of features/variables: 19
# Each product/item has 19 features in the raw dataset.
# List the raw column names before selecting the useful subset below.
data.columns
Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
'color', 'brand', 'publisher', 'availability', 'reviews',
'large_image_url', 'availability_type', 'small_image_url',
'editorial_review', 'title', 'model', 'medium_image_url',
'manufacturer', 'editorial_reivew'],
dtype='object')
Of these 19 features, we will be using only the following 7 features
1. asin ( Amazon standard identification number)
2. brand ( brand to which the product belongs to )
3. color ( Color information of apparel, it can contain many colors as a value ex: red and black stripes )
4. product_type_name (type of the apparel, ex: SHIRT/TSHIRT )
5. medium_image_url ( url of the image )
6. title (title of the product.)
7. formatted_price (price of the product)
We do this because 'author', 'publisher', 'availability', 'large_image_url', 'availability_type', 'small_image_url', 'editorial_review', 'model', 'manufacturer', and 'editorial_reivew' are largely irrelevant for recommendation. We confirmed this informally by asking friends which attributes they actually look at when they scroll through products.
# Keep only the 7 columns useful for recommendation (see rationale above).
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]
print ('Number of data points : ', data.shape[0], 'Number of features:', data.shape[1])
data.head()
Number of data points : 183138 Number of features: 7
| asin | brand | color | medium_image_url | product_type_name | title | formatted_price | |
|---|---|---|---|---|---|---|---|
| 0 | B016I2TS4W | FNC7C | None | https://images-na.ssl-images-amazon.com/images... | SHIRT | Minions Como Superheroes Ironman Long Sleeve R... | None |
| 1 | B01N49AI08 | FIG Clothing | None | https://images-na.ssl-images-amazon.com/images... | SHIRT | FIG Clothing Womens Izo Tunic | None |
| 2 | B01JDPCOHO | FIG Clothing | None | https://images-na.ssl-images-amazon.com/images... | SHIRT | FIG Clothing Womens Won Top | None |
| 3 | B01N19U5H5 | Focal18 | None | https://images-na.ssl-images-amazon.com/images... | SHIRT | Focal18 Sailor Collar Bubble Sleeve Blouse Shi... | None |
| 4 | B004GSI2OS | FeatherLite | Onyx Black/ Stone | https://images-na.ssl-images-amazon.com/images... | SHIRT | Featherlite Ladies' Long Sleeve Stain Resistan... | $26.26 |
print(data['product_type_name'].describe())
count 183138 unique 72 top SHIRT freq 167794 Name: product_type_name, dtype: object
# names of different product types
# (72 distinct values; dominated by 'SHIRT', per describe() above)
print(data['product_type_name'].unique())
['SHIRT' 'SWEATER' 'APPAREL' 'OUTDOOR_RECREATION_PRODUCT' 'BOOKS_1973_AND_LATER' 'PANTS' 'HAT' 'SPORTING_GOODS' 'DRESS' 'UNDERWEAR' 'SKIRT' 'OUTERWEAR' 'BRA' 'ACCESSORY' 'ART_SUPPLIES' 'SLEEPWEAR' 'ORCA_SHIRT' 'HANDBAG' 'PET_SUPPLIES' 'SHOES' 'KITCHEN' 'ADULT_COSTUME' 'HOME_BED_AND_BATH' 'MISC_OTHER' 'BLAZER' 'HEALTH_PERSONAL_CARE' 'TOYS_AND_GAMES' 'SWIMWEAR' 'CONSUMER_ELECTRONICS' 'SHORTS' 'HOME' 'AUTO_PART' 'OFFICE_PRODUCTS' 'ETHNIC_WEAR' 'BEAUTY' 'INSTRUMENT_PARTS_AND_ACCESSORIES' 'POWERSPORTS_PROTECTIVE_GEAR' 'SHIRTS' 'ABIS_APPAREL' 'AUTO_ACCESSORY' 'NONAPPARELMISC' 'TOOLS' 'BABY_PRODUCT' 'SOCKSHOSIERY' 'POWERSPORTS_RIDING_SHIRT' 'EYEWEAR' 'SUIT' 'OUTDOOR_LIVING' 'POWERSPORTS_RIDING_JACKET' 'HARDWARE' 'SAFETY_SUPPLY' 'ABIS_DVD' 'VIDEO_DVD' 'GOLF_CLUB' 'MUSIC_POPULAR_VINYL' 'HOME_FURNITURE_AND_DECOR' 'TABLET_COMPUTER' 'GUILD_ACCESSORIES' 'ABIS_SPORTS' 'ART_AND_CRAFT_SUPPLY' 'BAG' 'MECHANICAL_COMPONENTS' 'SOUND_AND_RECORDING_EQUIPMENT' 'COMPUTER_COMPONENT' 'JEWELRY' 'BUILDING_MATERIAL' 'LUGGAGE' 'BABY_COSTUME' 'POWERSPORTS_VEHICLE_PART' 'PROFESSIONAL_HEALTHCARE' 'SEEDS_AND_PLANTS' 'WIRELESS_ACCESSORY']
# finding the 10 most frequent product_type_names.
# Counter builds a word -> frequency map over the column values.
product_type_count = Counter(list(data['product_type_name']))
product_type_count.most_common(10)
[('SHIRT', 167794),
('APPAREL', 3549),
('BOOKS_1973_AND_LATER', 3336),
('DRESS', 1584),
('SPORTING_GOODS', 1281),
('SWEATER', 837),
('OUTERWEAR', 796),
('OUTDOOR_RECREATION_PRODUCT', 729),
('ACCESSORY', 636),
('UNDERWEAR', 425)]
As we can see above, 'SHIRT' appeared a lot of times in the data (167k times) we procured. This is expected as shirt data is what we queried.
# there are 10577 unique brands
# Summary stats of the brand column.
print(data['brand'].describe())
count 182987 unique 10577 top Zago freq 223 Name: brand, dtype: object
# Top-10 brands by number of products.
brand_count = Counter(list(data['brand']))
brand_count.most_common(10)
[('Zago', 223),
('XQS', 222),
('Yayun', 215),
('YUNY', 198),
('XiaoTianXin-women clothes', 193),
('Generic', 192),
('Boohoo', 190),
('Alion', 188),
('Abetteric', 187),
('TheMogan', 187)]
print(data['color'].describe())
count 64956 unique 7380 top Black freq 13207 Name: color, dtype: object
# Top-10 colors; None dominates because most products lack color info.
color_count = Counter(list(data['color']))
color_count.most_common(10)
[(None, 118182),
('Black', 13207),
('White', 8616),
('Blue', 3570),
('Red', 2289),
('Pink', 1842),
('Grey', 1499),
('*', 1388),
('Green', 1258),
('Multi', 1203)]
print(data['formatted_price'].describe())
count 28395 unique 3135 top $19.99 freq 945 Name: formatted_price, dtype: object
# Top-10 price points; None dominates because most products lack a price.
price_count = Counter(list(data['formatted_price']))
price_count.most_common(10)
[(None, 154743),
('$19.99', 945),
('$9.99', 749),
('$9.50', 601),
('$14.99', 472),
('$7.50', 463),
('$24.99', 414),
('$29.99', 370),
('$8.99', 343),
('$9.01', 336)]
print(data['title'].describe())
count 183138 unique 175985 top Nakoda Cotton Self Print Straight Kurti For Women freq 77 Name: title, dtype: object
#data.to_pickle('pickels/180k_apparel_data')
# Keep only rows with both price and color information (drops ~180k -> ~28k).
# considering products which have price information
# data['formatted_price'].isnull() => gives the information about the dataframe row's which have null values price == None|Null
# so here with ~data['formatted_price'].isnull() we are taking the products which have price information
data = data.loc[~data['formatted_price'].isnull()]
# consider products which have color information
# data['color'].isnull() => gives the information about the dataframe row's which have null values price == None|Null
# so here with ~data['color'].isnull() we are taking the products which have color information
data =data.loc[~data['color'].isnull()]
# find number of products that have duplicate titles.
print(sum(data.duplicated('title')))
2325
:B00AQ4GMCK |
:B00AQ4GMTS |
:B00AQ4GMLQ |
:B00AQ4GN3I |
:B00G278GZ6 |
:B00G278W6O |
:B00G278Z2A |
:B00G2786X8 |
# # read data from pickle file from previous stage
# data = pd.read_pickle('pickels/28k_apparel_data')
# Preview the price/color-filtered dataframe.
data.head()
| asin | brand | color | medium_image_url | product_type_name | title | formatted_price | |
|---|---|---|---|---|---|---|---|
| 4 | B004GSI2OS | FeatherLite | Onyx Black/ Stone | https://images-na.ssl-images-amazon.com/images... | SHIRT | Featherlite Ladies' Long Sleeve Stain Resistan... | $26.26 |
| 6 | B012YX2ZPI | HX-Kingdom Fashion T-shirts | White | https://images-na.ssl-images-amazon.com/images... | SHIRT | Women's Unique 100% Cotton T - Special Olympic... | $9.99 |
| 11 | B001LOUGE4 | Fitness Etc. | Black | https://images-na.ssl-images-amazon.com/images... | SHIRT | Ladies Cotton Tank 2x1 Ribbed Tank Top | $11.99 |
| 15 | B003BSRPB0 | FeatherLite | White | https://images-na.ssl-images-amazon.com/images... | SHIRT | FeatherLite Ladies' Moisture Free Mesh Sport S... | $20.54 |
| 21 | B014ICEDNA | FNC7C | Purple | https://images-na.ssl-images-amazon.com/images... | SHIRT | Supernatural Chibis Sam Dean And Castiel Short... | $7.50 |
# Remove all products with very few words in the title: a title shorter
# than 5 words (e.g. "shrt") carries too little information to compare
# products.  The 5-word threshold is heuristic, with no deeper meaning.
# .copy() materialises the filtered frame; without it, sort_values(...,
# inplace=True) runs on a view of `data` and raises SettingWithCopyWarning
# (and may silently not persist under pandas copy-on-write).
data_sorted = data[data['title'].apply(lambda x: len(x.split()) > 4)].copy()
# Sort the data by title (reverse alphabetical; the direction is irrelevant —
# we only need near-duplicate titles to land on adjacent rows).
data_sorted.sort_values('title', inplace=True, ascending=False)
data_sorted.head()
data_sorted.head()
| asin | brand | color | medium_image_url | product_type_name | title | formatted_price | |
|---|---|---|---|---|---|---|---|
| 61973 | B06Y1KZ2WB | Éclair | Black/Pink | https://images-na.ssl-images-amazon.com/images... | SHIRT | Éclair Women's Printed Thin Strap Blouse Black... | $24.99 |
| 133820 | B010RV33VE | xiaoming | Pink | https://images-na.ssl-images-amazon.com/images... | SHIRT | xiaoming Womens Sleeveless Loose Long T-shirts... | $18.19 |
| 81461 | B01DDSDLNS | xiaoming | White | https://images-na.ssl-images-amazon.com/images... | SHIRT | xiaoming Women's White Long Sleeve Single Brea... | $21.58 |
| 75995 | B00X5LYO9Y | xiaoming | Red Anchors | https://images-na.ssl-images-amazon.com/images... | SHIRT | xiaoming Stripes Tank Patch/Bear Sleeve Anchor... | $15.91 |
| 151570 | B00WPJG35K | xiaoming | White | https://images-na.ssl-images-amazon.com/images... | SHIRT | xiaoming Sleeve Sheer Loose Tassel Kimono Woma... | $14.32 |
Titles 1: 16. woman's place is in the house and the senate shirts for Womens XXL White 17. woman's place is in the house and the senate shirts for Womens M Grey Title 2: 25. tokidoki The Queen of Diamonds Women's Shirt X-Large 26. tokidoki The Queen of Diamonds Women's Shirt Small 27. tokidoki The Queen of Diamonds Women's Shirt Large Title 3: 61. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt 62. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt 63. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt 64. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
# Stage-1 de-duplication: after the sort above, near-duplicate titles sit on
# adjacent rows (same product in different sizes/colours).  Scan the sorted
# order and keep one representative per group of titles differing in <= 2 words.
indices = []
for i,row in data_sorted.iterrows():
    indices.append(i)

import itertools
stage1_dedupe_asins = []
i = 0
j = 0
num_data_points = data_sorted.shape[0]
while i < num_data_points and j < num_data_points:
    previous_i = i
    # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
    # NOTE(review): titles are read from `data` while the labels in `indices`
    # come from `data_sorted`; this works because data_sorted is a filtered
    # subset of data sharing the same index labels — confirm.
    a = data['title'].loc[indices[i]].split()
    # search for the similar products sequentially
    j = i+1
    while j < num_data_points:
        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']
        b = data['title'].loc[indices[j]].split()
        # store the maximum length of two strings
        length = max(len(a), len(b))
        # count: number of positions at which both titles carry the same word
        count = 0
        for k in itertools.zip_longest(a,b):
            if (k[0] == k[1]):
                count += 1
        # if the titles differ in > 2 words they are different apparels:
        # keep the i-th one and restart the scan from the j-th title;
        # otherwise (differ in <= 2 words) treat them as the same apparel
        # and keep scanning.
        if (length - count) > 2: # number of words in which both sentences differ
            # if both strings differ by more than 2 words we include the 1st string's asin
            stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])
            # start searching for similar apparels corresponding to the 2nd string
            i = j
            break
        else:
            j += 1
    # inner scan exhausted without advancing i => nothing more to compare
    # NOTE(review): in that case the final group's asin is never appended to
    # stage1_dedupe_asins, so the last group is dropped — confirm intended.
    if previous_i == i:
        break

# keep only the representative products found above
data = data.loc[data['asin'].isin(stage1_dedupe_asins)]
In the previous cell, we sorted the whole data by title so that near-duplicate titles became adjacent, and then removed adjacent titles that were very similar. But there are some products whose titles are very similar without being adjacent. Examples: Titles-1 86261. UltraClub Women's Classic Wrinkle-Free Long Sleeve Oxford Shirt, Pink, XX-Large 115042. UltraClub Ladies Classic Wrinkle-Free Long-Sleeve Oxford Light Blue XXL Titles-2 75004. EVALY Women's Cool University Of UTAH 3/4 Sleeve Raglan Tee 109225. EVALY Women's Unique University Of UTAH 3/4 Sleeve Raglan Tees 120832. EVALY Women's New University Of UTAH 3/4-Sleeve Raglan Tshirt
# Stage-2 de-duplication: O(n^2) pass that removes near-duplicate titles even
# when they are NOT adjacent after sorting (e.g. "Women's"/"Ladies" variants).
indices = []
for i, row in data.iterrows():
    indices.append(i)

stage2_dedupe_asins = []
while len(indices) != 0:
    # take one remaining product as the representative of its group
    i = indices.pop()
    stage2_dedupe_asins.append(data['asin'].loc[i])
    # words of the representative apparel's title
    a = data['title'].loc[i].split()
    # Compare against every other remaining title.  Iterate over a snapshot:
    # calling indices.remove(j) while looping over `indices` itself would
    # skip the element immediately after each removal (mutate-while-iterate
    # bug), leaving some duplicates in the data.  Exact row counts may
    # therefore differ slightly from earlier recorded runs.
    for j in list(indices):
        b = data['title'].loc[j].split()
        length = max(len(a), len(b))
        # count: number of positions at which both titles carry the same word
        count = 0
        for k in itertools.zip_longest(a, b):
            if k[0] == k[1]:
                count += 1
        # titles differing in fewer than 3 words are the same apparel: drop j
        if (length - count) < 3:
            indices.remove(j)

# from the whole previous set, keep only the representatives found above
data = data.loc[data['asin'].isin(stage2_dedupe_asins)]
print('Number of data points after stage two of dedupe: ',data.shape[0])
# from 17k apparels we reduced to 16k apparels
Number of data points after stage two of dedupe: 16551
# Checkpoint: persist the deduped ~16k products.  Readers can download this
# pickle from the shared Google Drive folder instead of reprocessing the
# full 180k dataset.
data.to_pickle('../data/data_backup')
data = pd.read_pickle('../data/data_backup')
# Download the NLTK "popular" collection (stop words, tokenizers, ...).
# RUN ONLY ONCE per machine.  Equivalent manual steps from a terminal:
#   $ python3
#   >>> import nltk
#   >>> nltk.download()
import nltk
nltk.download("popular")
[nltk_data] Downloading collection 'popular' [nltk_data] | [nltk_data] | Downloading package cmudict to /root/nltk_data... [nltk_data] | Unzipping corpora/cmudict.zip. [nltk_data] | Downloading package gazetteers to /root/nltk_data... [nltk_data] | Unzipping corpora/gazetteers.zip. [nltk_data] | Downloading package genesis to /root/nltk_data... [nltk_data] | Unzipping corpora/genesis.zip. [nltk_data] | Downloading package gutenberg to /root/nltk_data... [nltk_data] | Unzipping corpora/gutenberg.zip. [nltk_data] | Downloading package inaugural to /root/nltk_data... [nltk_data] | Unzipping corpora/inaugural.zip. [nltk_data] | Downloading package movie_reviews to [nltk_data] | /root/nltk_data... [nltk_data] | Unzipping corpora/movie_reviews.zip. [nltk_data] | Downloading package names to /root/nltk_data... [nltk_data] | Unzipping corpora/names.zip. [nltk_data] | Downloading package shakespeare to /root/nltk_data... [nltk_data] | Unzipping corpora/shakespeare.zip. [nltk_data] | Downloading package stopwords to /root/nltk_data... [nltk_data] | Unzipping corpora/stopwords.zip. [nltk_data] | Downloading package treebank to /root/nltk_data... [nltk_data] | Unzipping corpora/treebank.zip. [nltk_data] | Downloading package twitter_samples to [nltk_data] | /root/nltk_data... [nltk_data] | Unzipping corpora/twitter_samples.zip. [nltk_data] | Downloading package omw to /root/nltk_data... [nltk_data] | Unzipping corpora/omw.zip. [nltk_data] | Downloading package wordnet to /root/nltk_data... [nltk_data] | Unzipping corpora/wordnet.zip. [nltk_data] | Downloading package wordnet31 to /root/nltk_data... [nltk_data] | Unzipping corpora/wordnet31.zip. [nltk_data] | Downloading package wordnet_ic to /root/nltk_data... [nltk_data] | Unzipping corpora/wordnet_ic.zip. [nltk_data] | Downloading package words to /root/nltk_data... [nltk_data] | Unzipping corpora/words.zip. [nltk_data] | Downloading package maxent_ne_chunker to [nltk_data] | /root/nltk_data... 
[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip. [nltk_data] | Downloading package punkt to /root/nltk_data... [nltk_data] | Unzipping tokenizers/punkt.zip. [nltk_data] | Downloading package snowball_data to [nltk_data] | /root/nltk_data... [nltk_data] | Downloading package averaged_perceptron_tagger to [nltk_data] | /root/nltk_data... [nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip. [nltk_data] | [nltk_data] Done downloading collection popular
True
# we use the list of stop words that are downloaded from nltk lib.
# A set gives O(1) membership tests in the preprocessing loop below.
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)
def nlp_preprocessing(total_text, index, column):
    """Clean one text cell of the global `data` frame in place.

    Strips non-alphanumeric characters from each word, lower-cases it,
    drops NLTK stop words, then writes the cleaned text back into
    data[column] at row label `index` (words joined by single spaces,
    with a trailing space, matching the original output format).
    """
    # Process only real strings.  The original guard was
    # `type(total_text) is not int`, which let NaN floats through and
    # crash on .split(); non-strings are now skipped instead.
    if isinstance(total_text, str):
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # convert all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in stop_words:
                string += word + " "
        # .at replaces the original chained assignment
        # (data[column][index] = string), which raises
        # SettingWithCopyWarning and silently fails under pandas
        # copy-on-write semantics.
        data.at[index, column] = string
list of stop words: {'our', 'with', 'while', 'ain', 'how', 'this', 'which', "should've", 'during', 'there', 'she', 'he', 'where', "it's", 'but', 'itself', 'that', 'a', 'and', "couldn't", 'after', 'whom', 'once', 'them', 'some', 'wouldn', 'did', 'does', 't', 'were', 'him', 'now', "haven't", 'am', 'herself', "doesn't", 'was', 'if', 'each', 'didn', 'theirs', 'are', "she's", 'be', 'll', 'those', 'until', 'against', 'been', 'myself', 'such', 'just', 'shan', 'of', 'here', 'an', "wouldn't", 'no', 'too', 'your', 'before', "don't", 'on', 'couldn', 'below', "you'd", 'only', 'has', 'or', "didn't", 'they', 'off', "needn't", "won't", 're', 'their', "shouldn't", 'between', 'wasn', 'm', "mightn't", 'mustn', 'hadn', 'aren', 'her', 'd', 'i', 'by', 'into', 'why', 'you', 'because', 'for', 'its', 'do', 'same', 'y', "you've", 'having', 'in', 'haven', 'is', 'further', 'yours', 'again', 'any', "that'll", 'both', 'have', 'doesn', 'weren', 'what', 'all', 'the', 'out', 'my', 'it', 'from', 'as', 'very', 'when', "wasn't", 'won', "you're", 'don', 'themselves', 'own', 'most', 'up', 'other', 'isn', 'had', 'will', 'who', 'doing', "aren't", 's', 'through', 'ours', 'ma', 'yourselves', 'his', 'about', 'we', 'at', 'these', "weren't", 'not', 'should', 'than', "shan't", 'himself', 'under', 'shouldn', "you'll", 'ourselves', 'nor', 'so', 'can', 'hasn', 'being', 'me', "isn't", 'over', 'o', 'hers', 'down', 'to', 'few', "hasn't", 'yourself', 'then', 'mightn', 'needn', 'more', "hadn't", 'above', "mustn't", 've'}
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# recommended high-resolution replacement for timing code.
start_time = time.perf_counter()
# we take each title and we text-preprocess it.
for index, row in data.iterrows():
    nlp_preprocessing(row['title'], index, 'title')
print("The time it took to preprocess whole titles", time.perf_counter() - start_time, "seconds")
The time it took to preprocess whole titles 6.267260999999962 seconds
# Checkpoint the preprocessed titles so the NLP step need not be re-run.
data.to_pickle('../data/data_preprocessed_backup')
data = pd.read_pickle('../data/data_preprocessed_backup')
#Display an image
def display_img(url, ax, fig):
    """Download the apparel image at `url` and draw it in the notebook.

    `ax` and `fig` are accepted for interface compatibility; the image is
    drawn through the pyplot state machine (current axes).
    """
    resp = requests.get(url)
    picture = Image.open(BytesIO(resp.content))
    plt.imshow(picture)
#plotting code to understand the algorithm's decision.
def plot_heatmap(keys, values, labels, url, text):
    """Draw a one-row heatmap of word scores for a recommended title,
    side by side with the product image, in a single figure.

    keys   : words of the recommended title (x-axis labels)
    values : len(values) == len(keys); values[i] = occurrence count of keys[i]
    labels : len(labels) == len(keys); cell annotations, depending on model:
             bag of words          -> labels[i] = values[i]
             tfidf weighted        -> labels[i] = tfidf(keys[i])
             idf weighted          -> labels[i] = idf(keys[i])
    url    : apparel image url
    text   : figure title (the recommended apparel's title)
    """
    # divide the figure into two parts: heatmap (left) and image (right)
    gs = gridspec.GridSpec(2, 2, width_ratios=[4,1], height_ratios=[4,1])
    fig = plt.figure(figsize=(25,3))
    # 1st, plot the heatmap of word counts for title2
    ax = plt.subplot(gs[0])
    # a cell is non-zero (lighter) iff the word occurs in both titles
    ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    ax.set_xticklabels(keys) # x-axis labels are the words of the title
    ax.set_title(text) # apparel title
    # 2nd, plot the image of the apparel
    ax = plt.subplot(gs[1])
    # no grid lines and no ticks on the image panel
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    # download and draw the image
    display_img(url, ax, fig)
    # show heatmap and image together
    plt.show()
def plot_heatmap_image(doc_id, vec1, vec2, url, text, model):
    """Prepare keys/values/labels for plot_heatmap from two title vectors.

    doc_id : row index of the query title in the feature matrices
    vec1   : query apparel's vector, dict {word: count}
    vec2   : recommended apparel's vector, dict {word: count}.
             NOTE: mutated in place — counts of words not shared with vec1
             are zeroed so they show dark in the heatmap.
    url    : apparel image url
    text   : recommended apparel's title (used as the figure title)
    model  : 'bag_of_words' | 'tfidf' | 'idf'
    """
    # only the common words contribute to the distance between the two titles
    intersection = set(vec1.keys()) & set(vec2.keys())
    # zero out non-intersecting words, purely for display contrast
    for i in vec2:
        if i not in intersection:
            vec2[i]=0
    # keys: all words of title2, used to label the heatmap columns
    keys = list(vec2.keys())
    # values[i] = count of keys[i] in title2 (0 if not shared with title1)
    values = [vec2[x] for x in vec2.keys()]
    # labels depend on the model (see plot_heatmap docstring)
    if model == 'bag_of_words':
        labels = values
    elif model == 'tfidf':
        labels = []
        for x in vec2.keys():
            # tfidf_title_vectorizer / tfidf_title_features are module-level
            # objects built by the TfidfVectorizer cell of this notebook;
            # tfidf_title_features[doc_id, col] is the word's tf-idf in doc_id
            if x in tfidf_title_vectorizer.vocabulary_:
                labels.append(tfidf_title_features[doc_id, tfidf_title_vectorizer.vocabulary_[x]])
            else:
                labels.append(0)
    elif model == 'idf':
        labels = []
        for x in vec2.keys():
            # idf_title_vectorizer / idf_title_features are module-level
            # objects whose non-zero entries hold per-word idf values
            if x in idf_title_vectorizer.vocabulary_:
                labels.append(idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[x]])
            else:
                labels.append(0)
    plot_heatmap(keys, values, labels, url, text)
# this function gets a list of words along with the frequency of each
# word given "text"
def text_to_vector(text):
    """Return a Counter mapping each word of `text` to its frequency."""
    token_pattern = re.compile(r'\w+')
    tokens = token_pattern.findall(text)
    # Counter counts occurrences of each token: dict-like {word: count}
    return Counter(tokens)
def get_result(doc_id, content_a, content_b, url, model):
    """Convert both titles to word-count vectors and plot the comparison
    heatmap for the recommended apparel (delegates to plot_heatmap_image)."""
    # query vector: dict {word: count} for the input apparel's title
    query_vec = text_to_vector(content_a)
    # candidate vector: dict {word: count} for the recommended title
    candidate_vec = text_to_vector(content_b)
    plot_heatmap_image(doc_id, query_vec, candidate_vec, url, content_b, model)
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: one row per title, one column per vocabulary word.
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(data['title'])
title_features.get_shape() # getting number of rows and columns in feature matrix.
(16551, 12732)
def bag_of_words_model(doc_id, num_results):
    """Show the `num_results` most similar apparels to apparel `doc_id`
    under the bag-of-words title representation (prints details and plots
    a word-overlap heatmap per result).

    doc_id: row position of the query apparel in the feature matrix.
    """
    # distance from the query title to every title in the corpus.
    # NOTE(review): pairwise_distances defaults to metric='euclidean' —
    # the original comment claimed cosine, but the printout below
    # ("Euclidean similarity") matches the actual behaviour.
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    pairwise_dist = pairwise_distances(title_features,title_features[doc_id])
    # positions of the num_results smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # the num_results smallest distances themselves
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # translate matrix positions into dataframe index labels
    df_indices = list(data.index[indices])
    for i in range(0,len(indices)):
        # we will pass 1. doc_id, 2. title1 (query), 3. title2 (candidate), url, model
        get_result(indices[i],data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')
        print('ASIN :',data['asin'].loc[df_indices[i]])
        print ('Brand:', data['brand'].loc[df_indices[i]])
        print ('Title:', data['title'].loc[df_indices[i]])
        print ('Euclidean similarity with the query image :', pdists[i])
        print('='*60)
#call the bag-of-words model for a product to get similar products.
# (12566 is an arbitrary query apparel.)
bag_of_words_model(12566, 5) # change the index if you want to.
# In the output heat map each value represents the count value
# of the label word, the color represents the intersection
# with inputs title.
ASIN : B072M2W79L Brand: Almost Famous Title: juniors almost famous rib henley swing top black size xl Euclidean similarity with the query image : 0.0 ============================================================
ASIN : B01MAZCL6C Brand: Almost Famous Title: almost famous swing henley top size l Euclidean similarity with the query image : 2.0 ============================================================
ASIN : B06XRPPVQC Brand: Almost Famous Title: almost famous juniors longsleeve black top size Euclidean similarity with the query image : 2.23606797749979 ============================================================
ASIN : B01M3RLP9B Brand: Almost Famous Title: almost famous juniors wine top size Euclidean similarity with the query image : 2.449489742783178 ============================================================
ASIN : B01MR9VMSE Brand: Almost Famous Title: almost famous highneck tank top juniors size xl Euclidean similarity with the query image : 2.449489742783178 ============================================================
# tf-idf features over the preprocessed titles.
# min_df=0 keeps every term, however rare.
tfidf_title_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])
def tfidf_model(doc_id, num_results):
    """Show the `num_results` most similar apparels to apparel `doc_id`
    under the tf-idf title representation.

    doc_id: row position of the query apparel in the feature matrix.
    """
    # distance from the query title to every title in the corpus.
    # NOTE(review): pairwise_distances defaults to metric='euclidean' —
    # the original comment claiming cosine is inaccurate.
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    pairwise_dist = pairwise_distances(tfidf_title_features,tfidf_title_features[doc_id])
    # positions of the num_results smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # the num_results smallest distances themselves
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # dataframe index labels of the closest apparels
    df_indices = list(data.index[indices])
    for i in range(0,len(indices)):
        # we will pass 1. doc_id, 2. title1 (query), 3. title2 (candidate), url, model
        get_result(indices[i], data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'tfidf')
        print('ASIN :',data['asin'].loc[df_indices[i]])
        print('BRAND :',data['brand'].loc[df_indices[i]])
        print ('Eucliden distance from the given image :', pdists[i])
        print('='*125)
# query the tf-idf model for the same apparel (12566) and show 5 neighbours.
tfidf_model(12566, 5)
# in the output heat map each value represents the tfidf values of the label word, the color represents the intersection with inputs title
ASIN : B074NC9MJM BRAND : XCVI Eucliden distance from the given image : 0.0 =============================================================================================================================
ASIN : B071VY48VC BRAND : XCVI Eucliden distance from the given image : 1.15704694867847 =============================================================================================================================
ASIN : B01N7V7SIG BRAND : XCVI Eucliden distance from the given image : 1.1718496553150264 =============================================================================================================================
ASIN : B01M31Q4Z0 BRAND : XCVI Eucliden distance from the given image : 1.2098060863159672 =============================================================================================================================
ASIN : B01M7U5BHH BRAND : XCVI Eucliden distance from the given image : 1.22960337478935 =============================================================================================================================
# idf-weighted bag of words: start from raw counts, then overwrite each
# non-zero count with the word's idf value (see the fill loop below).
idf_title_vectorizer = CountVectorizer()
idf_title_features = idf_title_vectorizer.fit_transform(data['title'])
def n_containing(word):
    """Return the number of titles (documents) that contain `word`."""
    matches = [1 for blob in data['title'] if word in blob.split()]
    return len(matches)
def idf(word):
    """idf(word) = log(N / df(word)), where N is the number of documents
    and df(word) the number of documents containing the word."""
    total_docs = data.shape[0]
    return math.log(total_docs / n_containing(word))
# we need to convert the count matrix to float before writing idf values.
# NOTE: the bare `np.float` alias was deprecated in NumPy 1.20 and removed
# in 1.24; use the explicit float64 dtype instead.
idf_title_features = idf_title_features.astype(np.float64)
for i in idf_title_vectorizer.vocabulary_.keys():
    # idf value of word i over the whole corpus
    idf_val = idf(i)
    # nonzero()[0] lists the rows (documents) in which word i occurs
    for j in idf_title_features[:, idf_title_vectorizer.vocabulary_[i]].nonzero()[0]:
        # replace the raw count of word i in document j with its idf value:
        # idf_title_features[doc_id, word_column] = idf(word)
        idf_title_features[j,idf_title_vectorizer.vocabulary_[i]] = idf_val
def idf_model(doc_id, num_results):
    """Show the `num_results` most similar apparels to apparel `doc_id`
    under the idf-weighted bag-of-words title representation.

    doc_id: row position of the query apparel in the feature matrix.
    """
    # distance from the query title to every title in the corpus.
    # NOTE(review): pairwise_distances defaults to metric='euclidean' —
    # the original comment claiming cosine is inaccurate.
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    pairwise_dist = pairwise_distances(idf_title_features,idf_title_features[doc_id])
    # positions of the num_results smallest distances
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    # the num_results smallest distances themselves
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    # dataframe index labels of the closest apparels
    df_indices = list(data.index[indices])
    for i in range(0,len(indices)):
        get_result(indices[i],data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'idf')
        print('ASIN :',data['asin'].loc[df_indices[i]])
        print('Brand :',data['brand'].loc[df_indices[i]])
        print ('euclidean distance from the given image :', pdists[i])
        print('='*125)
# query the idf-weighted model for the same apparel (12566), 5 neighbours.
idf_model(12566,5)
# in the output heat map each value represents the idf values of the label word, the color represents the intersection with inputs title
ASIN : B074NC9MJM Brand : XCVI euclidean distance from the given image : 0.0 =============================================================================================================================
ASIN : B071VY48VC Brand : XCVI euclidean distance from the given image : 18.512249582933418 =============================================================================================================================
ASIN : B01N7V7SIG Brand : XCVI euclidean distance from the given image : 18.681620268801424 =============================================================================================================================
ASIN : B01M31Q4Z0 Brand : XCVI euclidean distance from the given image : 19.29092855678568 =============================================================================================================================
ASIN : B00KF2N5PU Brand : Vietsbay euclidean distance from the given image : 19.69345982566793 =============================================================================================================================
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

# Load the pre-trained word2vec vectors (a dict-like word -> 300-d vector
# mapping pickled offline). NOTE(review): pickle.load can execute arbitrary
# code from the file -- only load pickles from a trusted source.
with open('../data/word2vec_model', 'rb') as handle:
    model = pickle.load(handle)
# Utility functions
def get_word_vec(sentence, doc_id, m_name):
    """Build a per-word vector matrix for a title.

    sentence: title of the apparel (whitespace-tokenized)
    doc_id:   row index of this document in idf_title_features
    m_name:   'avg'      -> append model[word] (plain w2v vector)
              'weighted' -> append idf(word) * model[word]

    Returns np.array of shape (#rows appended, 300); each row is the
    (weighted/avg) word2vec representation of one word of the sentence.
    """
    vec = []
    for i in sentence.split():
        if i in vocab:
            if m_name == 'weighted' and i in idf_title_vectorizer.vocabulary_:
                # scale the word's w2v vector by its idf weight in this document
                vec.append(idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[i]] * model[i])
            elif m_name == 'avg':
                vec.append(model[i])
            # NOTE(review): in 'weighted' mode a word that is in `vocab` but
            # missing from idf_title_vectorizer.vocabulary_ appends nothing, so
            # the matrix can have fewer rows than the title has words -- confirm
            # this is intended (it can desync heat-map tick labels).
        else:
            # word absent from the word2vec vocabulary: contribute a zero
            # vector so it still occupies a row in the heat map
            vec.append(np.zeros(shape=(300,)))
    return np.array(vec)
def get_distance(vec1, vec2):
    """Pairwise Euclidean distance matrix between two word-vector stacks.

    vec1: array-like of shape (#words_title1, 300), one row per word
    vec2: array-like of shape (#words_title2, 300), one row per word

    Returns np.array of shape (#words_title1, #words_title2) where
    result[i, j] = ||vec1[i] - vec2[j]||_2.
    """
    a = np.asarray(vec1, dtype=np.float64)
    b = np.asarray(vec2, dtype=np.float64)
    # Degenerate inputs (e.g. an empty title) are not 2-D stacks; fall back to
    # the straightforward per-pair loop so the output shape matches the
    # original implementation.
    if a.ndim != 2 or b.ndim != 2:
        return np.array([np.array([np.linalg.norm(i - j) for j in vec2])
                         for i in vec1])
    # Vectorized: broadcast (n, 1, 300) - (1, m, 300) -> (n, m, 300) and reduce
    # over the feature axis -- replaces the original O(n*m) Python loops.
    diff = a[:, np.newaxis, :] - b[np.newaxis, :, :]
    return np.sqrt(np.einsum('ijk,ijk->ij', diff, diff))
def heat_map_w2v(sentence1, sentence2, url, doc_id1, doc_id2, model):
    """Visualize word-level distances between two titles plus the product image.

    sentence1 / doc_id1: title and corpus id of the input apparel
    sentence2 / doc_id2: title and corpus id of the recommended apparel
    url:   image url of the recommended apparel
    model: 'avg' or 'weighted' -- which word-vector scheme to use
    """
    # Stack the (weighted/avg) 300-d vectors for each word of both titles,
    # then compute the word-by-word Euclidean distance matrix.
    input_stack = get_word_vec(sentence1, doc_id1, model)
    rec_stack = get_word_vec(sentence2, doc_id2, model)
    word_dists = get_distance(input_stack, rec_stack)

    # Split the figure: first cell for the heatmap, second for the image.
    grid = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[2, 1])
    fig = plt.figure(figsize=(15, 15))

    heat_ax = plt.subplot(grid[0])
    heat_ax = sns.heatmap(np.round(word_dists, 4), annot=True)
    heat_ax.set_xticklabels(sentence2.split())  # columns: recommended title words
    heat_ax.set_yticklabels(sentence1.split())  # rows: input title words
    heat_ax.set_title(sentence2)

    img_ax = plt.subplot(grid[1])
    # no grid lines or tick marks around the product image
    img_ax.grid(False)
    img_ax.set_xticks([])
    img_ax.set_yticks([])
    display_img(url, img_ax, fig)
    plt.show()
# vocab: the set of words the loaded word2vec model knows about.
# If using the Google News vectors via gensim, this would instead be:
#   vocab = model.wv.vocab.keys()
# Here `model` is a plain dict-like mapping, so .keys() suffices.
vocab = model.keys()
# Collapses a whole title into a single vector by summing its word vectors
# and dividing by the number of words.
def build_avg_vec(sentence, num_features, doc_id, m_name):
    """Average the word vectors of a title into one num_features-d vector.

    sentence:     title text of the apparel
    num_features: word2vec dimensionality (300 here)
    doc_id:       row of this document in idf_title_features
    m_name:       'avg' adds model[word]; 'weighted' adds idf(word) * model[word]

    Returns a (num_features,) float32 vector: the summed word vectors divided
    by the total number of words in the title.
    """
    total = np.zeros((num_features,), dtype="float32")
    word_count = 0
    for token in sentence.split():
        # every token counts toward the divisor, even ones without a vector
        word_count += 1
        if token in vocab:
            if m_name == 'weighted' and token in idf_title_vectorizer.vocabulary_:
                total = np.add(total, idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[token]] * model[token])
            elif m_name == 'avg':
                total = np.add(total, model[token])
    if(word_count>0):
        total = np.divide(total, word_count)
    return total
# Build one 300-d average-word2vec vector per title. A comprehension with
# enumerate replaces the original manually maintained doc_id counter.
# w2v_title: np.array of shape (#docs in corpus, 300), one row per document.
w2v_title = np.array([build_avg_vec(title, 300, doc_id, 'avg')
                      for doc_id, title in enumerate(data['title'])])
def avg_w2v_model(doc_id, num_results):
    """Show the num_results nearest titles to apparel `doc_id` using avg-w2v features."""
    # Euclidean distance from the query row to every row of w2v_title.
    dist_matrix = pairwise_distances(w2v_title, w2v_title[doc_id].reshape(1,-1))
    flat_dists = dist_matrix.flatten()
    # positions / distances of the num_results closest documents
    nearest = np.argsort(flat_dists)[:num_results]
    nearest_dists = np.sort(flat_dists)[:num_results]
    # translate corpus positions into data-frame index labels
    frame_ids = list(data.index[nearest])
    for rank in range(len(nearest)):
        heat_map_w2v(data['title'].loc[frame_ids[0]],data['title'].loc[frame_ids[rank]], data['medium_image_url'].loc[frame_ids[rank]], nearest[0], nearest[rank], 'avg')
        print('ASIN :',data['asin'].loc[frame_ids[rank]])
        print('BRAND :',data['brand'].loc[frame_ids[rank]])
        print('euclidean distance from given input image :', nearest_dists[rank])
        print('='*125)
# Demo: 5 nearest neighbours of item 12566 under average-word2vec features.
avg_w2v_model(12566, 5)
# In each heat map, cell (i, j) holds the euclidean distance between word i of
# the input title and word j of the recommended title.
ASIN : B074NC9MJM BRAND : XCVI euclidean distance from given input image : 0.0 =============================================================================================================================
ASIN : B001EVK62Q BRAND : hotel Collections euclidean distance from given input image : 0.5927989 =============================================================================================================================
ASIN : B014MU8IZW BRAND : Elementz euclidean distance from given input image : 0.6163741 =============================================================================================================================
ASIN : B00FZ9G5DC BRAND : Nikibiki euclidean distance from given input image : 0.6557259 =============================================================================================================================
ASIN : B0719286XL BRAND : Catherine Malandrino euclidean distance from given input image : 0.7018532 =============================================================================================================================
ASIN : B0716QB15B BRAND : Cupio euclidean distance from given input image : 0.7022328 =============================================================================================================================
ASIN : B073WGTX9F BRAND : Thalia euclidean distance from given input image : 0.7059877 =============================================================================================================================
ASIN : B06XVM869S BRAND : Zadig & Voltaire euclidean distance from given input image : 0.72250324 =============================================================================================================================
ASIN : B0756SQ1Q3 BRAND : Worthington euclidean distance from given input image : 0.7278088 =============================================================================================================================
ASIN : B01HUQJXRW BRAND : Bisou Bisou euclidean distance from given input image : 0.7282631 =============================================================================================================================
ASIN : B071DY3716 BRAND : Laundry by Shelli Segal euclidean distance from given input image : 0.73314846 =============================================================================================================================
ASIN : B06XVZS5W4 BRAND : Worthington euclidean distance from given input image : 0.7345943 =============================================================================================================================
ASIN : B01GV3O5DW BRAND : Almost Famous euclidean distance from given input image : 0.73595756 =============================================================================================================================
ASIN : B01MD2I474 BRAND : Bisou Bisou euclidean distance from given input image : 0.7372231 =============================================================================================================================
ASIN : B01N6D6E6Y BRAND : Style & Co. euclidean distance from given input image : 0.738461 =============================================================================================================================
ASIN : B017FAUGQA BRAND : Elementz euclidean distance from given input image : 0.7446933 =============================================================================================================================
ASIN : B06XXLZL9G BRAND : BCX euclidean distance from given input image : 0.74887747 =============================================================================================================================
ASIN : B06VSLCVTY BRAND : Sofie & Sam euclidean distance from given input image : 0.75182325 =============================================================================================================================
ASIN : B071VHFC47 BRAND : Worthington euclidean distance from given input image : 0.75822365 =============================================================================================================================
ASIN : B015PDWOS2 BRAND : Kjus euclidean distance from given input image : 0.760332 =============================================================================================================================
# Build one 300-d idf-weighted word2vec vector per title (cf. the 'avg' pass
# above). A comprehension with enumerate replaces the manual doc_id counter.
# w2v_title_weight: np.array of shape (#docs in corpus, 300), one row per doc.
w2v_title_weight = np.array([build_avg_vec(title, 300, doc_id, 'weighted')
                             for doc_id, title in enumerate(data['title'])])
def weighted_w2v_model(doc_id, num_results):
    """Show the num_results nearest titles to apparel `doc_id` using idf-weighted w2v features."""
    # distance from the query document to every document in the corpus
    # http://scikit-learn.org/stable/modules/metrics.html
    dist_matrix = pairwise_distances(w2v_title_weight, w2v_title_weight[doc_id].reshape(1,-1))
    flat_dists = dist_matrix.flatten()
    closest = np.argsort(flat_dists)[:num_results]
    closest_dists = np.sort(flat_dists)[:num_results]
    # data-frame index labels of the closest documents
    frame_ids = list(data.index[closest])
    for rank in range(len(closest)):
        heat_map_w2v(data['title'].loc[frame_ids[0]],data['title'].loc[frame_ids[rank]], data['medium_image_url'].loc[frame_ids[rank]], closest[0], closest[rank], 'weighted')
        print('ASIN :',data['asin'].loc[frame_ids[rank]])
        print('Brand :',data['brand'].loc[frame_ids[rank]])
        print('euclidean distance from input :', closest_dists[rank])
        print('='*125)
# Demo: 5 nearest neighbours of item 12566 under idf-weighted word2vec features.
weighted_w2v_model(12566, 5)
# In each heat map, cell (i, j) holds the euclidean distance between word i of
# the input title and word j of the recommended title.
ASIN : B074NC9MJM Brand : XCVI euclidean distance from input : 0.0 =============================================================================================================================
ASIN : B074N6DQD2 Brand : Sufei euclidean distance from input : 3.858672 =============================================================================================================================
ASIN : B01KIETDV4 Brand : Rossmore euclidean distance from input : 3.862221 =============================================================================================================================
ASIN : B06XH2N9SG Brand : Retro-ology euclidean distance from input : 3.8688939 =============================================================================================================================
ASIN : B015IXMGLY Brand : Nikibiki euclidean distance from input : 3.9017062 =============================================================================================================================
# Some brand values are missing; replace NaN with the placeholder "Not given"
# so the CountVectorizer below does not choke on float NaNs.
data['brand'].fillna(value="Not given", inplace=True )
# Replace spaces with hyphens so a multi-word value ("Tommy Hilfiger") becomes
# a single CountVectorizer token instead of two.
# NOTE(review): only 'brand' is NaN-filled above -- if 'product_type_name' or
# 'color' can still contain NaN, .replace on a float will raise; presumably
# they were cleaned earlier in the notebook -- confirm.
brands = [x.replace(" ", "-") for x in data['brand'].values]
types = [x.replace(" ", "-") for x in data['product_type_name'].values]
colors = [x.replace(" ", "-") for x in data['color'].values]
# One-hot style bag-of-words encoding for each categorical feature.
brand_vectorizer = CountVectorizer()
brand_features = brand_vectorizer.fit_transform(brands)
type_vectorizer = CountVectorizer()
type_features = type_vectorizer.fit_transform(types)
color_vectorizer = CountVectorizer()
color_features = color_vectorizer.fit_transform(colors)
# Stack the three sparse matrices side by side: one combined row per product.
extra_features = hstack((brand_features, type_features, color_features)).tocsr()
def heat_map_w2v_brand(sentance1, sentance2, url, doc_id1, doc_id2, df_id1, df_id2, model):
    """Heat map of word distances plus a plotly table of brand/color/type info.

    sentance1 / doc_id1 / df_id1: title, corpus position and data-frame index
        of the input apparel
    sentance2 / doc_id2 / df_id2: same three values for the recommended apparel
    url:   image url of the recommended apparel
    model: 'avg' or 'weighted' -- which word-vector scheme to use
    """
    # word-vector stacks for both titles and their pairwise distance matrix
    input_stack = get_word_vec(sentance1, doc_id1, model)
    rec_stack = get_word_vec(sentance2, doc_id2, model)
    word_dists = get_distance(input_stack, rec_stack)

    # side-by-side comparison table of the categorical features
    data_matrix = [['Asin','Brand', 'Color', 'Product type'],
                   [data['asin'].loc[df_id1],brands[doc_id1], colors[doc_id1], types[doc_id1]],
                   [data['asin'].loc[df_id2],brands[doc_id2], colors[doc_id2], types[doc_id2]]]
    # colours for the table headings
    colorscale = [[0, '#1d004d'],[.5, '#f2e5ff'],[1, '#f2e5d1']]
    table = ff.create_table(data_matrix, index=True, colorscale=colorscale)
    plotly.offline.iplot(table, filename='simple_table')

    # 25 x 15 grid: columns 0-9 hold the heatmap, 10-14 the product image
    layout = gridspec.GridSpec(25, 15)
    fig = plt.figure(figsize=(25,5))

    heat_ax = plt.subplot(layout[:, :-5])
    heat_ax = sns.heatmap(np.round(word_dists, 6), annot=True)
    heat_ax.set_xticklabels(sentance2.split())  # recommended title words
    heat_ax.set_yticklabels(sentance1.split())  # input title words
    heat_ax.set_title(sentance2)

    img_ax = plt.subplot(layout[:, 10:16])
    # no grid lines or tick marks around the image
    img_ax.grid(False)
    img_ax.set_xticks([])
    img_ax.set_yticks([])
    display_img(url, img_ax, fig)
    plt.show()
def idf_w2v_brand(doc_id, w1, w2, num_results):
    """Recommend via a weighted blend of idf-w2v title distance and brand/type/color distance.

    doc_id: apparel's position in the corpus
    w1:     weight given to the idf-weighted w2v title features
    w2:     weight given to the brand/type/color one-hot features
    """
    # distances from the query to every document under each feature family
    # http://scikit-learn.org/stable/modules/metrics.html
    title_dists = pairwise_distances(w2v_title_weight, w2v_title_weight[doc_id].reshape(1,-1))
    categorical_dists = pairwise_distances(extra_features, extra_features[doc_id])
    # weighted average of the two distance matrices
    blended = (w1 * title_dists + w2 * categorical_dists)/float(w1 + w2)
    flat_dists = blended.flatten()
    top = np.argsort(flat_dists)[:num_results]
    top_dists = np.sort(flat_dists)[:num_results]
    # data-frame index labels of the closest documents
    frame_ids = list(data.index[top])
    for rank in range(len(top)):
        heat_map_w2v_brand(data['title'].loc[frame_ids[0]],data['title'].loc[frame_ids[rank]], data['medium_image_url'].loc[frame_ids[rank]], top[0], top[rank],frame_ids[0], frame_ids[rank], 'weighted')
        print('ASIN :',data['asin'].loc[frame_ids[rank]])
        print('Brand :',data['brand'].loc[frame_ids[rank]])
        print('euclidean distance from input :', top_dists[rank])
        print('='*125)
# Demo: blend title and categorical distances with equal weights (w1 = w2 = 5).
idf_w2v_brand(12566, 5, 5, 5)
# In each heat map, cell (i, j) holds the euclidean distance between word i of
# the input title and word j of the recommended title.
ASIN : B074NC9MJM Brand : XCVI euclidean distance from input : 0.0 =============================================================================================================================
ASIN : B01N7V7SIG Brand : XCVI euclidean distance from input : 2.7072637557983397 =============================================================================================================================
ASIN : B06ZZX46Z6 Brand : Bobi euclidean distance from input : 2.9091387308108057 =============================================================================================================================
ASIN : B019EGEV9Q Brand : Trendyloosefit euclidean distance from input : 2.9831577814089503 =============================================================================================================================
ASIN : B06X16RKT8 Brand : ECI euclidean distance from input : 2.9977934396731105 =============================================================================================================================
ASIN : B0718Y9J4M Brand : f euclidean distance from input : 3.0040958404541014 =============================================================================================================================
ASIN : B0718ZFSY7 Brand : BODEN euclidean distance from input : 3.0045811212527003 =============================================================================================================================
ASIN : B01L61OJNU Brand : DSQUARED2 euclidean distance from input : 3.035955098913833 =============================================================================================================================
ASIN : B074N6DQD2 Brand : Sufei euclidean distance from input : 3.0473699643968675 =============================================================================================================================
ASIN : B01KIETDV4 Brand : Rossmore euclidean distance from input : 3.0491445615648365 =============================================================================================================================
ASIN : B071DN2JDX Brand : BODEN euclidean distance from input : 3.065048459814712 =============================================================================================================================
ASIN : B015IXMGLY Brand : Nikibiki euclidean distance from input : 3.0688871457933518 =============================================================================================================================
ASIN : B014LKPLYE Brand : Rossmore euclidean distance from input : 3.0719467237352465 =============================================================================================================================
ASIN : B074SPFNSW Brand : Sufei euclidean distance from input : 3.0759361341356373 =============================================================================================================================
ASIN : B0759R8F7G Brand : Everleigh euclidean distance from input : 3.0810888364671802 =============================================================================================================================
ASIN : B01L61KCHC Brand : DSQUARED2 euclidean distance from input : 3.083333257483169 =============================================================================================================================
ASIN : B0711Z8LP7 Brand : bellatrix euclidean distance from input : 3.0892168119310472 =============================================================================================================================
ASIN : B0155JBTLO Brand : Studio M euclidean distance from input : 3.091695073889419 =============================================================================================================================
ASIN : B01N65XQOG Brand : DSQUARED2 euclidean distance from input : 3.093203361308489 =============================================================================================================================
ASIN : B074MJPLCB Brand : BollyDoll euclidean distance from input : 3.0945304944871994 =============================================================================================================================
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import requests
from PIL import Image
import pandas as pd
import pickle
Using TensorFlow backend.
# https://gist.github.com/fchollet/f35fbc80e066a49d65f1688a7e99f069
# Code reference: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html
# Rough timings for extracting features for all 16042 images:
#   GPU (NVidia 1050): ~0.175 s per image (~40 minutes total)
#   CPU (i7):          ~0.615 s per image (~160 minutes total)
# Each image is converted into a 25088-length dense vector.
# input dimensions expected by VGG-16
img_width, img_height = 224, 224
top_model_weights_path = 'bottleneck_fc_model.h5'
train_data_dir = 'images2/'
nb_train_samples = 16042  # number of images under train_data_dir
epochs = 50
batch_size = 1
def save_bottlebeck_features():
    """Extract VGG-16 bottleneck features for every training image and save them.

    Runs each image in `train_data_dir` through the VGG-16 convolutional base
    (no top classifier) and writes:
      - ../data/data_cnn_features.npy       (nb_train_samples x 25088 floats)
      - ../data/data_cnn_feature_asins.npy  (ASIN of each row, from filenames)
    """
    asins = []
    datagen = ImageDataGenerator(rescale=1. / 255)
    # VGG16 convolutional base only, pre-trained imagenet weights
    model = applications.VGG16(include_top=False, weights='imagenet')
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    for i in generator.filenames:
        # NOTE(review): assumes filenames look like '<c>/<ASIN>.jpeg' -- a
        # 1-char subdirectory plus a 4-char extension; confirm against the
        # actual directory layout.
        asins.append(i[2:-5])
    bottleneck_features_train = model.predict_generator(generator, nb_train_samples // batch_size)
    # Flatten each 7x7x512 feature map into one 25088-d row. Use the module
    # constant instead of repeating the magic number 16042.
    bottleneck_features_train = bottleneck_features_train.reshape((nb_train_samples, 25088))
    # np.save accepts a path directly and closes the file itself (the original
    # open(...) handles were never closed).
    np.save('../data/data_cnn_features.npy', bottleneck_features_train)
    np.save('../data/data_cnn_feature_asins.npy', np.array(asins))
# Run the (slow) feature extraction once; afterwards the results are reloaded
# from disk.
save_bottlebeck_features()
#load the features and corresponding ASINS info.
bottleneck_features_train = np.load('../data/data_cnn_features.npy')
asins = np.load('../data/data_cnn_feature_asins.npy')
asins = list(asins)
# ASINs in data-frame order: maps a corpus position to its CNN feature row.
df_asins = list(data['asin'])
from IPython.display import display, Image, SVG, Math, YouTubeVideo
#get similar products using CNN features (VGG-16)
def get_similar_products_cnn(doc_id, num_results):
    """Display the num_results visually closest products to apparel `doc_id`.

    doc_id: position of the query apparel in the data frame; translated to the
    corresponding row of the CNN feature matrix via its ASIN.
    """
    doc_id = asins.index(df_asins[doc_id])
    # Euclidean distance from the query image's features to every image.
    pairwise_dist = pairwise_distances(bottleneck_features_train, bottleneck_features_train[doc_id].reshape(1,-1))
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    pdists = np.sort(pairwise_dist.flatten())[0:num_results]
    for i in range(len(indices)):
        rows = data[['medium_image_url','title']].loc[data['asin']==asins[indices[i]]]
        for indx, row in rows.iterrows():
            display(Image(url=row['medium_image_url'], embed=True))
            print('Product Title: ', row['title'])
            print('Euclidean Distance from input image:', pdists[i])
            # fixed typo: 'amzon' -> 'amazon' so the printed link actually resolves
            print('Amazon Url: www.amazon.com/dp/'+ asins[indices[i]])
get_similar_products_cnn(12566, 5)
Product Title: xcvi romalyn top kamel wash saddle size large Euclidean Distance from input image: 6.180516e-06 Amazon Url: www.amzon.com/dp/B074NC9MJM
Product Title: cable gauge large dark blush peasant top Euclidean Distance from input image: 35.34564 Amazon Url: www.amzon.com/dp/B074XPJWZT
Product Title: style co womens metallic scoopneck pullover top gray Euclidean Distance from input image: 36.92667 Amazon Url: www.amzon.com/dp/B01N5K6P4E
Product Title: ideology raglan spaceddyed longsleeve top size xs Euclidean Distance from input image: 37.799213 Amazon Url: www.amzon.com/dp/B01MFDNJZ8